library(sf)
library(dplyr)
library(tidyr)
library(janitor)
library(ggplot2)
library(viridis)
library(leaflet)
provinces_full <- readRDS("../data_clean/provinces_full.rds")
prov_df <- prov %>% st_drop_geometry()
summary_stats <- prov_df %>%
summarise(
mort_mean = mean(diabetes_mort_rate, na.rm = TRUE),
mort_sd = sd(diabetes_mort_rate, na.rm = TRUE),
mort_min = min(diabetes_mort_rate, na.rm = TRUE),
mort_max = max(diabetes_mort_rate, na.rm = TRUE),
age_mean = mean(pct_65plus, na.rm = TRUE),
unemp_mean = mean(unemployment_rate, na.rm = TRUE),
edu_mean = mean(low_education_share, na.rm = TRUE),
nutr_mean = mean(adequate_nutrition, na.rm = TRUE),
sed_mean = mean(sedentariness, na.rm = TRUE)
)
summary_stats
## mort_mean mort_sd mort_min mort_max age_mean unemp_mean edu_mean nutr_mean sed_mean
## 1 4.118037 1.440078 1.85 9.08 25.00153 6.677508 48.26879 16.89252 33.74673
## 3.1 Diabetes mortality rate ####
# most affective:
prov_df %>%
arrange(desc(diabetes_mort_rate)) %>%
select(prov_name, diabetes_mort_rate) %>%
slice(1:10)
## prov_name diabetes_mort_rate
## 1 Agrigento 9.08
## 2 Enna 7.71
## 3 Reggio di Calabria 7.35
## 4 Isernia 7.29
## 5 Crotone 7.15
## 6 Cosenza 6.78
## 7 Trapani 6.35
## 8 Catanzaro 6.33
## 9 Caltanissetta 6.32
## 10 Potenza 6.12
# least effective:
prov_df %>%
arrange(diabetes_mort_rate) %>%
select(prov_name, diabetes_mort_rate) %>%
slice(1:10)
## prov_name diabetes_mort_rate
## 1 Bolzano 1.85
## 2 Lecco 2.01
## 3 Trento 2.11
## 4 Aosta 2.20
## 5 Sondrio 2.24
## 6 Monza e della Brianza 2.27
## 7 Brescia 2.33
## 8 Prato 2.43
## 9 Reggio nell'Emilia 2.46
## 10 Bergamo 2.48
# we can already see that provinces located in the south have greater diabetes mortality rate.
## 3.2 Ageing ####
# most affective:
prov_df %>%
arrange(desc(pct_65plus)) %>%
select(prov_name, pct_65plus) %>%
slice(1:10)
## prov_name pct_65plus
## 1 Biella 29.8925
## 2 Savona 29.7446
## 3 Genova 29.0450
## 4 Oristano 28.8914
## 5 Grosseto 28.7334
## 6 Ferrara 28.7055
## 7 Trieste 28.6880
## 8 Terni 28.5791
## 9 Alessandria 28.4154
## 10 Imperia 28.3253
# least effective:
prov_df %>%
arrange(pct_65plus) %>%
select(prov_name, pct_65plus) %>%
slice(1:10)
## prov_name pct_65plus
## 1 Caserta 18.9922
## 2 Napoli 19.5782
## 3 Bolzano 20.2766
## 4 Barletta-Andria-Trani 21.1191
## 5 Ragusa 21.2972
## 6 Catania 21.5232
## 7 Crotone 22.0505
## 8 Bergamo 22.1082
## 9 Salerno 22.3033
## 10 Lodi 22.3805
## 3.3 Unemployment ####
# most affective:
prov_df %>%
arrange(desc(unemployment_rate)) %>%
select(prov_name, unemployment_rate) %>%
slice(1:10)
## prov_name unemployment_rate
## 1 Napoli 20.36696
## 2 Messina 16.44512
## 3 Foggia 16.34486
## 4 Agrigento 16.20707
## 5 Catanzaro 15.93901
## 6 Siracusa 15.46265
## 7 Palermo 14.66749
## 8 Cosenza 14.11287
## 9 Caltanissetta 13.99437
## 10 Vibo Valentia 13.60034
# least effective:
prov_df %>%
arrange(unemployment_rate) %>%
select(prov_name, unemployment_rate) %>%
slice(1:10)
## prov_name unemployment_rate
## 1 Bergamo 1.545735
## 2 Pordenone 1.782660
## 3 Cremona 1.900585
## 4 Bolzano 1.990011
## 5 Treviso 2.327501
## 6 Prato 2.553198
## 7 Padova 2.558400
## 8 Lodi 2.582747
## 9 Verona 2.589108
## 10 Firenze 2.688453
# same thing: unemplyment rate is clearly greater in the south
## 3.4 education ####
edu_region <- prov_df %>%
distinct(region_name, low_education_share)
# Most affected (highest low education share)
edu_region %>%
arrange(desc(low_education_share)) %>%
slice(1:10)
## region_name low_education_share
## 1 Puglia 56.62414
## 2 Sardegna 55.62087
## 3 Sicilia 55.44252
## 4 Calabria 53.77098
## 5 Campania 52.95224
## 6 Valle d'Aosta 49.14710
## 7 Molise 48.90983
## 8 Piemonte 48.33849
## 9 Basilicata 47.95307
## 10 Toscana 47.44715
# Least affected (lowest low education share)
edu_region %>%
arrange(low_education_share) %>%
slice(1:10)
## region_name low_education_share
## 1 Lazio 40.12169
## 2 Umbria 42.54312
## 3 Friuli-Venezia Giulia 42.82400
## 4 Liguria 43.59771
## 5 Trentino-Alto Adige 43.92755
## 6 Emilia-Romagna 44.06597
## 7 Abruzzo 44.96981
## 8 Lombardia 45.81564
## 9 Veneto 47.02004
## 10 Marche 47.12458
## 3.5 Nutrition and sedentariness ####
nut_region <- prov_df %>%
distinct(region_name, adequate_nutrition)
# Best (highest adequate nutrition)
nut_region %>%
arrange(desc(adequate_nutrition)) %>%
slice(1:10)
## region_name adequate_nutrition
## 1 Piemonte 25.4
## 2 Marche 23.0
## 3 Emilia-Romagna 21.9
## 4 Friuli-Venezia Giulia 19.4
## 5 Valle d'Aosta 18.9
## 6 Liguria 18.8
## 7 Toscana 18.8
## 8 Lombardia 18.1
## 9 Lazio 17.9
## 10 Trentino-Alto Adige 17.6
# Worst (lowest adequate nutrition)
nut_region %>%
arrange(adequate_nutrition) %>%
slice(1:10)
## region_name adequate_nutrition
## 1 Basilicata 7.1
## 2 Campania 9.9
## 3 Sicilia 10.1
## 4 Puglia 11.1
## 5 Molise 11.6
## 6 Calabria 12.9
## 7 Abruzzo 14.4
## 8 Veneto 14.6
## 9 Umbria 17.2
## 10 Sardegna 17.5
# the same here, the south has the lowest adequate nutrition
sed_region <- prov_df %>%
distinct(region_name, sedentariness)
# Best (highest sedentariness)
sed_region %>%
arrange(desc(sedentariness)) %>%
slice(1:10)
## region_name sedentariness
## 1 Basilicata 53.7
## 2 Campania 53.1
## 3 Sicilia 52.5
## 4 Puglia 48.6
## 5 Calabria 48.2
## 6 Molise 38.9
## 7 Sardegna 34.8
## 8 Lazio 32.0
## 9 Abruzzo 31.5
## 10 Umbria 30.5
# Worst (lowest sedentariness)
sed_region %>%
arrange(sedentariness) %>%
slice(1:10)
## region_name sedentariness
## 1 Trentino-Alto Adige 13.8
## 2 Friuli-Venezia Giulia 22.6
## 3 Veneto 23.1
## 4 Lombardia 25.5
## 5 Emilia-Romagna 26.2
## 6 Valle d'Aosta 26.4
## 7 Marche 28.8
## 8 Piemonte 29.1
## 9 Toscana 29.1
## 10 Liguria 29.6
# the same
# Helper to create quantile-based groups with nice labels
qcut <- function(x, probs, labels) {
cut(x,
breaks = quantile(x, probs = probs, na.rm = TRUE),
include.lowest = TRUE,
labels = labels)
}
prov_cat <- prov_df %>%
mutate(
# Outcome: mortality quartiles
mort_q = qcut(diabetes_mort_rate,
probs = c(0, .25, .50, .75, 1),
labels = c("Q1 (lowest)", "Q2", "Q3", "Q4 (highest)")),
# Predictors: tertiles (Low/Medium/High)
age_ter = qcut(pct_65plus,
probs = c(0, 1/3, 2/3, 1),
labels = c("Low", "Medium", "High")),
unemp_ter = qcut(unemployment_rate,
probs = c(0, 1/3, 2/3, 1),
labels = c("Low", "Medium", "High")),
edu_ter = qcut(low_education_share,
probs = c(0, 1/3, 2/3, 1),
labels = c("Low", "Medium", "High")),
nutr_ter = qcut(adequate_nutrition,
probs = c(0, 1/3, 2/3, 1),
labels = c("Low", "Medium", "High")),
sed_ter = qcut(sedentariness,
probs = c(0, 1/3, 2/3, 1),
labels = c("Low", "Medium", "High"))
)
crosstab_mort <- function(data, var) {
data %>%
count(mort_q, {{ var }}) %>%
group_by(mort_q) %>%
mutate(pct_within_mort = n / sum(n) * 100) %>%
ungroup() %>%
arrange(mort_q, {{ var }})
}
tab_age <- crosstab_mort(prov_cat, age_ter)
tab_age
## # A tibble: 12 × 4
## mort_q age_ter n pct_within_mort
## <fct> <fct> <int> <dbl>
## 1 Q1 (lowest) Low 15 55.6
## 2 Q1 (lowest) Medium 10 37.0
## 3 Q1 (lowest) High 2 7.41
## 4 Q2 Low 5 17.9
## 5 Q2 Medium 10 35.7
## 6 Q2 High 13 46.4
## 7 Q3 Low 3 12
## 8 Q3 Medium 5 20
## 9 Q3 High 17 68
## 10 Q4 (highest) Low 13 48.1
## 11 Q4 (highest) Medium 10 37.0
## 12 Q4 (highest) High 4 14.8
Here the question is: “within each diabetes mortality quartile, how are provinces distributed across age structure (Low / Medium / High %65+)?”
Q1: among provinces with the lowest diabetes mortality, more than half have a low share of elderly, and very few have a high share of elderly.
Q2: in slightly higher mortality provinces, the majority already have medium-to-high ageing.
Q3: for provinces in the third mortality quartile, over two thirds belong to the highest ageing group
Q4: descending
Provinces with very high diabetes mortality are not dominated by highly aged populations. Ageing explains diabetes mortality up to a point, but cannot explain the highest mortality levels alone.
tab_unem <- crosstab_mort(prov_cat, unemp_ter)
tab_unem
## # A tibble: 11 × 4
## mort_q unemp_ter n pct_within_mort
## <fct> <fct> <int> <dbl>
## 1 Q1 (lowest) Low 19 70.4
## 2 Q1 (lowest) Medium 6 22.2
## 3 Q1 (lowest) High 2 7.41
## 4 Q2 Low 12 42.9
## 5 Q2 Medium 12 42.9
## 6 Q2 High 4 14.3
## 7 Q3 Low 5 20
## 8 Q3 Medium 12 48
## 9 Q3 High 8 32
## 10 Q4 (highest) Medium 5 18.5
## 11 Q4 (highest) High 22 81.5
Q1: 70.4% Low unemployment: Very few high-unemployment provinces
Q2: Balanced Low / Medium unemployment, Still few high-unemployment provinces
Q3: 32% High unemployment, Nearly half Medium, Unemployment starts to matter more
Q4: 81.5% High unemployment, Almost no low-unemployment provinces
Provinces with the highest diabetes mortality are overwhelmingly concentrated in the highest unemployment tertile, suggesting a strong socio-economic gradient that is not explained by age alone.
tab_edu <- crosstab_mort(prov_cat, edu_ter)
tab_edu
## # A tibble: 12 × 4
## mort_q edu_ter n pct_within_mort
## <fct> <fct> <int> <dbl>
## 1 Q1 (lowest) Low 18 66.7
## 2 Q1 (lowest) Medium 7 25.9
## 3 Q1 (lowest) High 2 7.41
## 4 Q2 Low 11 39.3
## 5 Q2 Medium 15 53.6
## 6 Q2 High 2 7.14
## 7 Q3 Low 10 40
## 8 Q3 Medium 8 32
## 9 Q3 High 7 28
## 10 Q4 (highest) Low 3 11.1
## 11 Q4 (highest) Medium 2 7.41
## 12 Q4 (highest) High 22 81.5
Q1: 66.7% Low low-education share, very few High: better education = lower mortality
Q2, Q3: Increasing presence of Medium & High, gradient is visible but smoother
Q4: 81.5% High low-education share –> Strong structural association
The highest mortality quartile is strongly associated with regions exhibiting a high proportion of low-educated population, highlighting the role of long-term structural socio-economic disadvantages.
tab_nutr <- crosstab_mort(prov_cat, nutr_ter)
tab_nutr
## # A tibble: 11 × 4
## mort_q nutr_ter n pct_within_mort
## <fct> <fct> <int> <dbl>
## 1 Q1 (lowest) Low 3 11.1
## 2 Q1 (lowest) Medium 14 51.9
## 3 Q1 (lowest) High 10 37.0
## 4 Q2 Low 4 14.3
## 5 Q2 Medium 12 42.9
## 6 Q2 High 12 42.9
## 7 Q3 Low 8 32
## 8 Q3 Medium 12 48
## 9 Q3 High 5 20
## 10 Q4 (highest) Low 25 92.6
## 11 Q4 (highest) Medium 2 7.41
Q1: Mostly Medium–High nutrition, Only 11% Low nutrition –> nutrition has a protective effect
Q2, Q3: Mixed distribution
Q4: 92.6% Low adequate nutrition
Provinces with the highest diabetes mortality are almost exclusively located in regions characterized by low levels of adequate nutrition, reinforcing the role of diet-related factors in diabetes-related mortality.
tab_sed <- crosstab_mort(prov_cat, sed_ter)
tab_sed
## # A tibble: 11 × 4
## mort_q sed_ter n pct_within_mort
## <fct> <fct> <int> <dbl>
## 1 Q1 (lowest) Low 25 92.6
## 2 Q1 (lowest) Medium 1 3.70
## 3 Q1 (lowest) High 1 3.70
## 4 Q2 Low 10 35.7
## 5 Q2 Medium 16 57.1
## 6 Q2 High 2 7.14
## 7 Q3 Low 5 20
## 8 Q3 Medium 13 52
## 9 Q3 High 7 28
## 10 Q4 (highest) Medium 3 11.1
## 11 Q4 (highest) High 24 88.9
Q1: 92.6% Low sedentariness, Low mortality provinces are almost entirely in low-sedentary regions
Q2, Q3: Mostly Medium sedentariness, Some High in Q3 –> Clear gradient emerging
Q4: 88.9% High sedentariness –> Extremely strong concentration
Provinces in the highest mortality quartile are almost entirely located in regions with high levels of sedentary behavior, indicating that lifestyle-related risk factors play a key role in explaining extreme mortality outcomes.
Diabetes Mortality Rate
### 5.1.1 diabetes mortality rate ####
ggplot(provinces_full) +
geom_sf(aes(fill = diabetes_mort_rate), color = "grey30", size = 0.1) +
scale_fill_viridis(
name = "Diabetes mortality rate",
option = "plasma",
direction = -1
) +
labs(
title = "Diabetes mortality rate by province",
subtitle = "Quantile classification",
caption = "Source: ISTAT"
) +
theme_minimal()
ggsave("../outputs/diabetes_mortality_rate.png", width = 6, height = 4, dpi = 300)
Ageing
### 5.1.2 Ageing ####
ggplot(provinces_full) +
geom_sf(aes(fill = pct_65plus), color = "grey30", size = 0.1) +
scale_fill_viridis(
name = "65+ (%)",
option = "magma",
) +
labs(
title = "Percentage of 65+ by province"
) +
theme_minimal()
ggsave("../outputs/pct_65+.png", width = 6, height = 4, dpi = 300)
Unemployment Rate
### 5.1.3 Unemplyment rate ####
ggplot(provinces_full) +
geom_sf(aes(fill = unemployment_rate), color = "grey30", size = 0.1) +
scale_fill_viridis(
name = "Unemployment rate (%)",
option = "magma",
direction = -1
) +
labs(
title = "Unemployment rate by province"
) +
theme_minimal()
ggsave("../outputs/unemployment_rate.png", width = 6, height = 4, dpi = 300)
Education
### 5.1.4 Low Education Share ####
ggplot(provinces_full) +
geom_sf(aes(fill = low_education_share), color = "grey30", size = 0.1) +
scale_fill_viridis(
name = "Low Education Share (%)",
option = "mako",
direction = -1
) +
labs(
title = "Low Education Share (regional indicator)",
subtitle = "Same value for provinces within the same region"
) +
theme_minimal()
ggsave("../outputs/low_education_share.png", width = 6, height = 4, dpi = 300)
Adequate Nutrition
### 5.1.4 Adequate Nutrition ####
ggplot(provinces_full) +
geom_sf(aes(fill = adequate_nutrition), color = "grey30", size = 0.1) +
scale_fill_viridis(
name = "Adequate nutrition (%)",
option = "viridis",
) +
labs(
title = "Adequate nutrition (regional indicator)",
subtitle = "Same value for provinces within the same region"
) +
theme_minimal()
ggsave("../outputs/adequate_nutrition.png", width = 6, height = 4, dpi = 300)
Sedentariness
### 5.1.5 Sedentariness ####
ggplot(provinces_full) +
geom_sf(aes(fill = sedentariness), color = "grey30", size = 0.1) +
scale_fill_viridis(
name = "Sedentary population (%)",
option = "inferno",
direction = -1
) +
labs(
title = "Sedentariness (regional indicator)",
subtitle = "Same value for provinces within the same region"
) +
theme_minimal()
ggsave("../outputs/sedentariness.png", width = 6, height = 4, dpi = 300)
st_crs(provinces_full)
## Coordinate Reference System:
## User input: WGS 84 / UTM zone 32N
## wkt:
## PROJCRS["WGS 84 / UTM zone 32N",
## BASEGEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4326]],
## CONVERSION["UTM zone 32N",
## METHOD["Transverse Mercator",
## ID["EPSG",9807]],
## PARAMETER["Latitude of natural origin",0,
## ANGLEUNIT["Degree",0.0174532925199433],
## ID["EPSG",8801]],
## PARAMETER["Longitude of natural origin",9,
## ANGLEUNIT["Degree",0.0174532925199433],
## ID["EPSG",8802]],
## PARAMETER["Scale factor at natural origin",0.9996,
## SCALEUNIT["unity",1],
## ID["EPSG",8805]],
## PARAMETER["False easting",500000,
## LENGTHUNIT["metre",1],
## ID["EPSG",8806]],
## PARAMETER["False northing",0,
## LENGTHUNIT["metre",1],
## ID["EPSG",8807]]],
## CS[Cartesian,2],
## AXIS["(E)",east,
## ORDER[1],
## LENGTHUNIT["metre",1]],
## AXIS["(N)",north,
## ORDER[2],
## LENGTHUNIT["metre",1]],
## ID["EPSG",32632]]
provinces_leaflet <- st_transform(provinces_full, 4326)
mort_breaks <- quantile(
provinces_leaflet$diabetes_mort_rate,
probs = seq(0, 1, 0.2),
na.rm = TRUE
)
pal_mort <- colorBin(
palette = "YlOrRd",
domain = provinces_leaflet$diabetes_mort_rate,
bins = mort_breaks
)
# interactive map
leaflet(provinces_leaflet) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(
fillColor = ~pal_mort(diabetes_mort_rate),
weight = 0.4,
fillOpacity = 0.8,
label = ~paste0(prov_name, " – ", region_name),
popup = ~paste0(
"<b>", prov_name, "</b><br>",
"<i>", region_name, "</i><br><br>",
"<b>Diabetes mortality:</b> ",
round(diabetes_mort_rate, 1), "<br><br>",
"<b>Population 65+:</b> ",
round(pct_65plus, 1), "%<br>",
"<b>Unemployment:</b> ",
round(unemployment_rate, 1), "%<br><br>",
"<b>Sedentariness (region):</b> ",
sedentariness, "%<br>",
"<b>Adequate nutrition (region):</b> ",
adequate_nutrition, "%<br>",
"<b>Low education share (region):</b> ",
round(low_education_share, 1), "%"
)
) %>%
addLegend(
pal = pal_mort,
values = ~diabetes_mort_rate,
title = "Diabetes mortality rate<br>(per 10,000)",
labFormat = labelFormat(digits = 1),
opacity = 0.9,
position = "bottomright"
)